# Udemy Data Science & Machine Learning Bootcamp
# For displaying the figures:
from IPython import display
import numpy as np
# From Python 1-D List to a 1-D NumPy Array:
l1 = [1, 2, 3]
arr_1d = np.array(l1)
print(l1)
print(arr_1d)
print()
# From Python 2-D List to a 2-D NumPy Array:
l2 = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
arr_2d = np.array(l2)
print(l2)
print(arr_2d)
# A range of numbers in a Python List Vs a NumPy Array:
lr = list(range(0, 11, 2))
nr = np.arange(0, 11, 2)
print(lr)
print(nr)
# Generating a NumPy array with all ones or zeros:
print(np.ones(3))
print(np.ones((3, 3))) # Notice the dimensions as a Tuple
print(np.zeros(3))
print(np.zeros((3, 3)))
# Evenly separated numbers in a NumPy Array:
al = np.linspace(0, 20, 5)
an = np.array(al)
print(al)
print(type(al))
print(an)
print(type(an))
# The identity matrix in NumPy (note: the identity matrix is non-singular):
np.eye(3)
# Random Numbers Matrices using NumPy Arrays:
arn1 = np.random.rand(5)
arn2 = np.random.rand(2, 3)
arn3 = np.random.rand(1, 3, 2)
print(arn1)
print(arn2)
print(arn3)
# Random Integer within a certain range:
rn1 = np.random.randint(15)
rn2 = np.random.randint(2, 10)
rn3 = np.random.randint(2, 50, 10)
print(rn1)
print(rn2)
print(rn3)
# Reshape a 1-D Array to a 2-D Matrix:
arr1 = np.arange(9)
arr2 = arr1.reshape(3, 3)
print(arr1)
print(arr2)
# Get the Min, Max values and their Index locations from a NumPy Array:
arn1 = np.random.rand(5)
arn2 = np.random.rand(2, 3)
print(arn1)
print(arn2)
print()
print(arn1.min())
print(arn1.argmin())
print()
print(arn2.min())
print(arn2.argmin())  # fixed: was argmax(), which does not locate the min printed above
print(arn2.max())     # also show the max and its (flattened) index, as the header promises
print(arn2.argmax())
# Get the shape of a NumPy Array:
arn1 = np.random.rand(5)
arn2 = np.random.rand(2, 3)
print(arn1)
print(arn2)
print()
print(arn1.shape)
print(arn2.shape)
# To get the data type of elements in a NumPy Array:
arn1 = np.random.rand(2, 3)
arr1 = np.arange(9)
arr2 = arr1.reshape(3, 3)
print(arr1)
print(arr2)
print(arn1.dtype)
# Array Indexing and Slicing:
ar1 = np.arange(9)
ar2 = ar1.reshape(3, 3)
print(ar1)
print(ar2)
print()
print(ar1[2])
print(ar1[3:8])
print(ar2[0][1])
print(ar2[2][2])
print()
print(ar2[:2, 1:])
print(ar2[1, :])
print(ar2[:, 1])
# Conditional Selections (boolean masking):
ar = np.arange(1, 11)
print(ar)
# Elementwise comparison yields a boolean array of the same shape.
bool_ar = ar > 4
print(bool_ar)
# Indexing with the boolean mask keeps only the True positions.
arc = ar[bool_ar]
print(arc)
print(ar[ar >= 6])
# Adding and Subtracting two or more NumPy Arrays elementwise:
ar1 = np.arange(1, 6)
ar2 = np.arange(6, 11)
print(ar1)
print(ar2)
ara = ar1 + ar2
ars = ar1 - ar2
print(ara)
print(ars)
# A single element gets added/subtracted from an entire NumPy array:
arr = np.arange(21, 31)
print(arr)
print(arr-10)
print(arr+5)
# Universal NumPy Array functions for Mathematical operations:
arr = np.arange(1, 11)
print(np.sqrt(arr))
print(np.exp(arr))
print(np.log(arr))
import pandas as pd
# Converting a List, NumPy array, Dictionary to a Pandas Series:
labels = ['a','b','c'] # Labels
my_list = [10,20,30] # List
np_arr = np.array([10,20,30]) # NumPy Array
d = {'a':10,'b':20,'c':30} # Dictionary
# List to Series
s1 = pd.Series(my_list)
# List to Series with index
s2 = pd.Series(my_list, labels)
# NumPy array to Series with index
s3 = pd.Series(np_arr, labels)
# Dictionary to Series- Keys as index labels
s4 = pd.Series(d)
print(s1, '\n***')
print(s2, '\n***')
print(s3, '\n***')
print(s4, '\n***')
# Using Series Index to access data points:
# First Subject
students_marks_ce1010 = pd.Series([27, 23, 19, 29, 30], ['John', 'Tim', 'Steve', 'Mike', 'Kevin'])
# Second Subject
students_marks_ce5410 = pd.Series([23, 25, 22, 24, 30], ['John', 'Tim', 'Steve', 'Mike', 'Kevin'])
print(students_marks_ce1010, '\n')
print(students_marks_ce5410, '\n')
# Accessing Scores
print(students_marks_ce1010['Tim'],'\n')
# Merging Series based off Index
total_marks = students_marks_ce1010 + students_marks_ce5410
print('Total Marks from both the Courses:\n',total_marks)
from numpy.random import randn
np.random.seed(101)
# Creating a DataFrame:
df = pd.DataFrame(randn(5,4), index='A B C D E'.split(), columns='W X Y Z'.split())
df
# Single column
print(df['W'])
print(type(df['W']), '\n')
# Multiple columns
print(df[['W', 'X']], '\n')
# Removing rows and columns (use axis=1)
df1 = df.drop('E')
df2 = df.drop('Z', axis=1)
print(df.drop('E'), '\n')
print(df.drop('Z', axis=1), '\n')
print('df1: \n',df1, '\n')
print('df2: \n',df2, '\n')
print('df: \n',df)
df.drop('Z', axis=1, inplace=True)
print('Updated df: \n',df)
# Adding new Columns
df['Z1'] = df['W'] + df['Y']
df
# Accessing Rows using loc
print(df.loc['A'], '\n') # Single Row
print(df.loc[['A', 'C']], '\n') # Two different Rows
print(df.loc['A':'C'], '\n') # Slicing of Rows
# Accessing Rows using iloc
print(df.iloc[0], '\n') # Single Row
print(df.iloc[[0, 2]], '\n') # Two different Rows
print(df.iloc[0:2], '\n') # Slicing of Rows
'''Interesting fact to note is that .loc[m:n] will go from m till n, including n.
However, .iloc[i:j] will go from row / column no. i till no. j-1, excluding j.'''
# A subset of DataFrame can also be obtained
print(df.loc[['A', 'C'], ['W', 'Y']], '\n')
print(df.iloc[0:3,0:2])
# Booleans
print(df>0, '\n')
print(df[df>0], '\n')
# Booleans on a particular column
print(df['W'] < 0, '\n')
print(df[df['W'] < 0], '\n')
df_sub = df[df['W']>0]
print(df_sub, '\n')
# We can also use multiple conditionals
print(df[(df['W']>0) & (df['Y'] < 0)], '\n') # AND
print(df[(df['W']>0) | (df['Y'] < 0)], '\n') # OR
# Reset the Index: To the default index 0, 1, 2,...
print('Current DataFrame: \n',df)
print('\n',df.reset_index())
print('\n',df)
df.reset_index(inplace=True)
print('\n',df)
# Set a particular column as index
labels = [1, 2, 3, 4, 5]
df.set_index('index', inplace=True)
print('\n',df)
# Index Levels
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside))
# Forming the MultiIndex first
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index
df = pd.DataFrame(np.random.randn(6,2),index=hier_index,columns=['A','B'])
df
df.loc['G1']
df.loc['G1']['A']
df.loc['G2'].loc[1]
df.index.names
df.index.names = ['Group','Num']
df
df.xs('G1')
df.xs(['G1',1])
df.xs(1,level='Num')
df = pd.DataFrame({'A':[1,2,np.nan],
'B':[5,np.nan,np.nan],
'C':[1,2,3]})
df
# Finding the Missing Values:
df.isna()
# Dropping Missing values across rows:
df.dropna(inplace=False)
# Dropping Missing values across columns:
df.dropna(axis = 1, inplace=False)
# Dropping only if Missing values are above a threshold 'x':
df.dropna(thresh=2)
# Filling in all Missing Values of column 'B' with the column mean.
# Assign the result back instead of `df['B'].fillna(..., inplace=True)`:
# chained inplace fillna is deprecated in modern pandas and may silently
# operate on a copy of the column.
df['B'] = df['B'].fillna(df['B'].mean())
df
# Create dataframe
data = {'Company':['GOOGLE','GOOGLE','AMAZON','AMAZON','FB','FB'],
'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
'Sales':[200,120,340,124,243,350]}
dfg = pd.DataFrame(data)
dfg
# Forming a grouby based off a particular column:
df_by_company = dfg.groupby('Company')
# Calling the aggregate functions:
df_by_company.mean()
df_by_company.sum()
df_by_company.describe().transpose()
df_by_company.describe().transpose()['GOOGLE']
# Creating DataFrames:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
'B': ['B8', 'B9', 'B10', 'B11'],
'C': ['C8', 'C9', 'C10', 'C11'],
'D': ['D8', 'D9', 'D10', 'D11']},
index=[8, 9, 10, 11])
# Vertical Concatenation
pd.concat([df1, df2, df3])
# Horizontal Concatenation
pd.concat([df1, df2, df3], axis=1)
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
pd.merge(left,right,how='inner',on='key')
# Or, to show a more complicated example:
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
pd.merge(left, right, on=['key1', 'key2'])
pd.merge(left, right, how='outer', on=['key1', 'key2'])
pd.merge(left, right, how='right', on=['key1', 'key2'])
pd.merge(left, right, how='left', on=['key1', 'key2'])
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
left.join(right)
left.join(right, how='outer')
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
left.join(right)
left.join(right, how='outer')
# Creating a DataFrame:
df = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})
df.head()
# Basic information about the DataFrame:
df.info()
# The list of all columns labels
df.columns
# The list of all row labels
df.index
# Finding unique values in a column of a DataFrame:
df['col1'].unique()
# Finding the count of unique values in a column of a DataFrame:
df['col2'].nunique()
# Finding the frequency of values in a column of a DataFrame:
df['col2'].value_counts()
# Finding the Null values in the DataFrame as a boolean
df.isnull()
# Finding the values from a column satisfying a given condition:
df[df['col1']>1]
# Select from DataFrame using criteria from multiple columns:
df[(df['col1']>1) & (df['col2']>450)]
# Applying a function on each value in a column:
def modulus(x):
    """Return ``x`` modulo 10 (the last decimal digit for non-negative ints)."""
    # Indentation was lost in the notebook export; restored here.
    return x % 10
df['col2'].apply(modulus)
df['col3'].apply(len)
# Drop a column
df.drop('col1', axis=1, inplace=False)
# Drop a row
df.drop(1, inplace=False)
# Permanently delete a column
del df['col2']
df
# Permanently add a column
df['col2'] = [444, 555, 666, 444]
df
df.sort_values(by='col2')
data = {'A':['foo','foo','foo','bar','bar','bar'],
'B':['one','one','two','two','one','one'],
'C':['x','y','x','y','x','y'],
'D':[1,3,2,5,4,1]}
df = pd.DataFrame(data)
df
df.pivot_table(values='D',index=['A', 'B'],columns=['C'])
df1 = pd.read_csv('example')
df1
df1.to_csv('example',index=False)
df2 = pd.read_excel('Excel_Sample.xlsx')
df2
df2.to_excel('Excel_Sample.xlsx')
import matplotlib.pyplot as plt
%matplotlib inline
# %matplotlib notebook
x = np.linspace(0, 5, 11)
y = x ** 2
plt.plot(x, y, 'r--o')
plt.xlabel('X Axis Title Here')
plt.ylabel('Y Axis Title Here')
plt.title('String Title Here');
# Multiple Subplots
# plt.subplot(nrows, ncols, plot_number)
plt.subplot(1,2,1)
plt.plot(x, y, 'r--') # More on color options later
plt.subplot(1,2,2)
plt.plot(y, x, 'g*-');
# Create Figure (empty canvas)
fig = plt.figure()
# Add set of axes to figure
axes = fig.add_axes([0.1, 0.1, 0.8, 0.8]) # left, bottom, width, height (range 0 to 1)
# Plot on that set of axes
axes.plot(x, y, 'b-*')
axes.set_xlabel('Set X Label') # Notice the use of set_ to begin methods
axes.set_ylabel('Set y Label')
axes.set_title('Set Title')
# Creates blank canvas
fig = plt.figure()
axes1 = fig.add_axes([0.1, 0.1, 0.8, 0.8]) # main axes
axes2 = fig.add_axes([0.2, 0.5, 0.4, 0.3]) # inset axes
# Larger Figure Axes 1
axes1.plot(x, y, 'b-s')
axes1.set_xlabel('X_label_axes2')
axes1.set_ylabel('Y_label_axes2')
axes1.set_title('Axes 2 Title')
# Insert Figure Axes 2
axes2.plot(y, x, 'r-.')
axes2.set_xlabel('X_label_axes2')
axes2.set_ylabel('Y_label_axes2')
axes2.set_title('Axes 2 Title');
# We can create a tuple unpacking where the 'axes' is basically an array of all the axes
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14,4))
# We can now call on each of the axes and plot
axes[0].plot(x, x**2, '--', 'r', label='x**2')
axes[1].plot(x, x**3, 'b-*', label='x**3')
axes[2].plot(x, x**2-5*x+6, 'g-s', label='x**2-5x+6')
# Set the Title and axes labels
axes[0].set_title('Linear')
axes[1].set_title('Cubic')
axes[2].set_title('Quadratic')
axes[0].set_xlabel('x')
axes[0].set_ylabel('y')
axes[1].set_xlabel('x')
axes[1].set_ylabel('y')
axes[2].set_xlabel('x')
axes[2].set_ylabel('y')
for ax in axes:
ax.legend(loc=0) # loc=0 finds the best location for the legend
plt.tight_layout()
linear_data = np.array([1,2,3,4,5,6,7,8])
exponential_data = linear_data**2
plt.figure()
# plot the linear data and the exponential data
plt.plot(linear_data, '-o', exponential_data, '-o')
# fill the area between the linear data and exponential data
plt.gca().fill_between(range(len(linear_data)),
linear_data, exponential_data,
facecolor='blue',
alpha=0.25);
# # Seaborn comes with built-in data sets!
# NOTE(review): `sns` is first used here, but `import seaborn as sns` only
# appears further down the file — without a local import this line raises
# NameError when the script is run top to bottom.
import seaborn as sns
df = sns.load_dataset('tips')
df.head()
df.plot.box();
df.plot.hist();
fig = plt.figure(figsize=(5, 4),dpi=80)
ax = fig.add_axes([0, 0, 1, 1])
ax.plot(x, x**2)
ax.set_title('Quadratic')
ax.set_xlabel('x')
ax.set_ylabel('y');
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
axes[0].plot(x, x**2, x, x**3)
axes[0].set_title("default axes ranges")
axes[1].plot(x, x**2, x, x**3)
axes[1].axis('tight')
axes[1].set_title("tight axes")
axes[2].plot(x, x**2, x, x**3)
axes[2].set_ylim([0, 60])
axes[2].set_xlim([2, 5])
axes[2].set_title("custom axes range");
plt.tight_layout()
import seaborn as sns
sns.set_style('darkgrid')
# # Seaborn comes with built-in data sets!
tips = sns.load_dataset('tips')
tips.head()
%%timeit -n 100 # IPython Magic function
sns.distplot(tips['total_bill'], kde=False);
# We can also vary the bins size to get a more detailed distribution
sns.distplot(tips['total_bill'], kde=False, bins=40);
# The default scatter kind plot
sns.jointplot(x='total_bill',y='tip',data=tips);
# The hexagonal plot
sns.jointplot(x='total_bill',y='tip',data=tips, kind='hex');
# The kde kind plot
sns.jointplot(x='total_bill',y='tip',data=tips, kind='kde');
# The regression plot
sns.jointplot(x='total_bill',y='tip',data=tips, kind='reg');
# The resid kind plot
sns.jointplot(x='total_bill',y='tip',data=tips, kind='resid');
sns.pairplot(tips);
sns.pairplot(tips, hue='sex');
sns.rugplot(tips['total_bill']);
# Barplot with a Categorical variable on the x-axis and a numerical variable on the y-axis compaing means
sns.barplot(x='sex', y='total_bill', data=tips);
# Barplot with a Categorical variable on the x-axis and a numerical variable on the y-axis comparing std dev
sns.barplot(x='sex', y='total_bill', data=tips, estimator=np.std);
# Barplot with a Categorical variable on the x-axis and a numerical variable on the y-axis comparing median
sns.barplot(x='sex', y='total_bill', data=tips, estimator=np.median);
# Countplot for categorical variables
sns.countplot(tips['sex']);
# Countplot for categorical variables with hue
sns.countplot(tips['size'], hue=tips['sex']);
# A boxplot with categorical variable on the x-axis and numerical varaiable on the y-axis
sns.boxplot(x='day', y='total_bill', data=tips);
# A boxplot with categorical variable on the x-axis and numerical varaiable on the y-axis with hue on sex
sns.boxplot(x='day', y='total_bill', data=tips, hue='sex');
# A boxplot with categorical variable on the x-axis and numerical varaiable on the y-axis with hue on smoker
sns.boxplot(x='day', y='total_bill', data=tips, hue='smoker');
# A violinplot with exact same arguments as the boxplot
sns.violinplot(x='day', y='total_bill', data=tips, hue='smoker');
# We can also merge these two hues with split
sns.violinplot(x='day', y='total_bill', data=tips, hue='smoker', split=True);
sns.factorplot(x='day', y='total_bill', data=tips, kind='bar');
flights = sns.load_dataset('flights')
flights.head()
sns.heatmap(tips.corr(), annot=True, cmap='magma', lw=1, linecolor='black');
sns.heatmap(flights.corr(), annot=True, cmap='RdBu_r');
iris = sns.load_dataset('iris')
iris.head()
# First create a PairGrid
g = sns.PairGrid(iris)
# Now plot or map on this PairGrid at upper,lower, and diagonal locations
g.map_diag(sns.distplot); # On the diagonal
g.map_upper(plt.scatter); # On the upper triangle
g.map_lower(sns.kdeplot); # On the lower triangle
# Create an empty FacetGrid
fg = sns.FacetGrid(tips, col='time', row='sex')
# Now map or plot to this grid with an uniariate plot
fg.map(sns.distplot, 'total_bill');
# Create an empty FacetGrid
fg1 = sns.FacetGrid(tips, col='time', row='sex')
# Now map or plot to this grid with a bivariate plot
fg1.map(sns.scatterplot, 'total_bill', 'tip');
sns.lmplot(x='total_bill', y='tip', data=tips);
# We can include a hue as well
sns.lmplot(x='total_bill', y='tip', data=tips, hue='sex', markers=['o', 'v']);
sns.lmplot(x='total_bill',y='tip',data=tips,col='sex');
# We just need to include the column and rows (no need for any grid)
sns.lmplot(x="total_bill", y="tip", row="sex", col="time",data=tips);
# We can control the height and the width of the plots using aspect and height arguments
sns.lmplot(x='total_bill',y='tip',data=tips,col='day',hue='sex',palette='seismic', aspect=0.8, height=5);
sns.set_style('white')
sns.countplot(tips['sex']);
sns.despine()
# Non Grid Plot
plt.figure(figsize=(6,4))
sns.countplot(x='sex',data=tips);
# Grid Type Plot
sns.lmplot(x='total_bill',y='tip',height=5,aspect=0.8,data=tips);

print('We can get more colormaps and pallettes from:')
display.Image("./Matplotlib Colormaps.png", width=500, height=50)
sns.set_context('notebook', font_scale=1);
sns.countplot(x='sex',data=tips,palette='seismic');
sns.set_style('whitegrid');
sns.set_context('notebook', font_scale=1);
sns.set_palette("RdBu_r")
# The main library used for Machine Learning in Python is scikit-learn. Its algorithmic cheat sheet is displayed below:
# Importing the required libraries:
from sklearn.model_selection import train_test_split # For Train-Test split
from sklearn.linear_model import LinearRegression # The Linear Regression Model
from sklearn import metrics # For Regression Evaluation Metrics
# Importing the dataset for predicting the Price of a house:
df = pd.read_csv('USA_Housing-Copy1.csv')
df.head()
df.info()
df.describe().transpose()
sns.pairplot(df);
fig, axes = plt.subplots(1, 5, figsize=(16, 4))
sns.distplot(df['Avg. Area Income'], ax=axes[0]);
sns.distplot(df['Avg. Area House Age'], ax=axes[1]);
sns.distplot(df['Avg. Area Number of Rooms'], ax=axes[2]);
sns.distplot(df['Avg. Area Number of Bedrooms'], ax=axes[3]);
sns.distplot(df['Area Population'], ax=axes[4]);
plt.tight_layout()
fig, axes = plt.subplots(1, 5, figsize=(14, 4))
sns.boxplot(df['Avg. Area Income'], ax=axes[0], orient='v');
sns.boxplot(df['Avg. Area House Age'], ax=axes[1], orient='v');
sns.boxplot(df['Avg. Area Number of Rooms'], ax=axes[2], orient='v');
sns.boxplot(df['Avg. Area Number of Bedrooms'], ax=axes[3], orient='v');
sns.boxplot(df['Area Population'], ax=axes[4], orient='v');
plt.tight_layout()
sns.heatmap(df.corr(), cmap='magma', annot=True);
# Columns having Numerical data are
numeric_features = df.select_dtypes(include=[np.number])
print(list(numeric_features.columns))
# Creating our X and y arrays by separating the target labels column:
X = df[['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
'Avg. Area Number of Bedrooms', 'Area Population']]
y = df['Price']
# The order of the split variables is important
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
lm = LinearRegression()
lm.fit(X_train, y_train) # Fit the model on the train split
# The intercept of the linear model:
lm.intercept_
# For coefficients can be properly shown in a DataFrame:
# print(lm.coef_)
coeff_df = pd.DataFrame(lm.coef_,X.columns,columns=['Coefficient'])
coeff_df
# Interpreting the coefficients:
# Does this make sense? Probably not, because this data is made up. For real data on which
# to repeat this sort of analysis, check out the Boston housing dataset:
from sklearn.datasets import load_boston
boston = load_boston()
print(boston.DESCR)
boston_df = boston.data
# Let's predict the y_test labels:
predictions = lm.predict(X_test)
predictions
sns.jointplot(y_test, predictions, kind='scatter');
# Residual Histogram
# A close to normal distribution of Residuals indicates correctness of the model
sns.distplot((y_test-predictions), bins=50);
# Here are three common evaluation metrics for regression problems:
#   Mean Absolute Error (MAE) — the mean of the absolute value of the errors:
#       MAE = (1/n) * sum_{i=1..n} |y_i - yhat_i|
#   Mean Squared Error (MSE) — the mean of the squared errors:
#       MSE = (1/n) * sum_{i=1..n} (y_i - yhat_i)^2
#   Root Mean Squared Error (RMSE) — the square root of the mean of the squared errors:
#       RMSE = sqrt((1/n) * sum_{i=1..n} (y_i - yhat_i)^2)
# Comparing these metrics: all of them are loss functions, because we want to minimize them.
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
# The R^2 value can also be obtained as:
print('R^2 score: ', metrics.explained_variance_score(y_test, predictions))
# Importing the required libraries:
from sklearn.model_selection import train_test_split # For Train-Test split
from sklearn.linear_model import LogisticRegression # The Linear Regression Model
from sklearn.metrics import classification_report # For Classification Report
from sklearn.metrics import plot_confusion_matrix # For Plotting Confusion Matrix
from sklearn.metrics import confusion_matrix # For Confusion Matrix
# Importing the dataset for predicting the Price of a house:
df = pd.read_csv('titanic_train-Copy1.csv')
df.head()
df.info()
df.describe().transpose()
df.nunique()
dfn = df.isnull().describe().transpose()
dfn[dfn['unique']==2]
# We can also use heatmaps to identify missing data
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='seismic');
plt.figure(figsize=(12, 7))
sns.boxplot(x='Pclass',y='Age',data=df,palette='winter');
def impute_age(cols):
    """Impute a missing Age from the passenger's ticket class.

    Intended for ``df[['Age', 'Pclass']].apply(impute_age, axis=1)``, so
    ``cols`` is a row Series with 'Age' and 'Pclass' labels.
    Returns the original Age when present; otherwise a per-class estimate
    (the class-wise typical ages seen in the boxplot above).
    """
    # Label-based access: positional `cols[0]` / `cols[1]` on a Series is
    # deprecated in modern pandas. (Indentation was also lost in the export.)
    age = cols['Age']
    pclass = cols['Pclass']
    if pd.isnull(age):
        if pclass == 1:
            return 37
        elif pclass == 2:
            return 29
        else:
            return 24
    return age
df['Age'] = df[['Age','Pclass']].apply(impute_age,axis=1)
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='seismic');
df.drop('Cabin',axis=1,inplace=True)
df.head()
df.dropna(inplace=True)
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='seismic');
# Columns having Numerical data are
numeric_features = df.select_dtypes(include=[np.number])
print(list(numeric_features.columns))
sns.pairplot(df);
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
sns.distplot(df['Age'], ax=axes[0]);
sns.distplot(df['Fare'], ax=axes[1]);
plt.tight_layout()
fig, axes = plt.subplots(2, 3, figsize=(12, 6))
sns.countplot(df['Survived'], ax=axes[0][0]);
sns.countplot(df['Pclass'], ax=axes[0][1]);
sns.countplot(df['Sex'], ax=axes[0][2]);
sns.countplot(df['SibSp'], ax=axes[1][0]);
sns.countplot(df['Parch'], ax=axes[1][1]);
sns.countplot(df['Embarked'], ax=axes[1][2]);
plt.tight_layout()
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
sns.boxplot(df['Age'], ax=axes[0], hue=df['Survived'], orient='v');
sns.countplot(df['Sex'], ax=axes[1], hue=df['Survived']);
sns.countplot(df['Pclass'], ax=axes[2], hue=df['Survived']);
plt.tight_layout()
df.info()
gender = pd.get_dummies(df['Sex'], drop_first=True)
embark = pd.get_dummies(df['Embarked'], drop_first=True)
# pclass = pd.get_dummies(df['Pclass'], drop_first=True)
df.drop(['Sex', 'Embarked','Name','Ticket'], axis=1, inplace=True)
df = pd.concat([df, gender, embark], axis=1)
df.drop('PassengerId', axis=1, inplace=True)
df.head()
X_train, X_test, y_train, y_test = train_test_split(df.drop('Survived',axis=1), df['Survived'], test_size=0.30, random_state=101)
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train);
predictions = logmodel.predict(X_test);
print(classification_report(y_test,predictions))
print('The Confusion Matrix is: \n')
matrixc = confusion_matrix(y_test, predictions)
print(matrixc)
matrix = plot_confusion_matrix(logmodel ,X_test, y_test, values_format='d')
matrix.ax_.set_title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.gcf().axes[0].tick_params(color='black')
plt.gcf().set_size_inches(4,3)
plt.show()
# Importing the required libraries:
import cufflinks as cf
cf.go_offline()
# Importing the required libraries:
from sklearn.model_selection import train_test_split # For Train-Test split
from sklearn.preprocessing import StandardScaler # For scaling the dataset
from sklearn.neighbors import KNeighborsClassifier # The KNN Model
from sklearn.metrics import classification_report # For Classification Report
from sklearn.metrics import plot_confusion_matrix # For Plotting Confusion Matrix
from sklearn.metrics import confusion_matrix # For Confusion Matrix
df = pd.read_csv('KNN_Project_Data-Copy1')
df.head()
df.info()
df.describe().transpose()
df.nunique()
lc = list(df.columns)
fig, axes = plt.subplots(2, 5, figsize=(16, 4))
j = 0
c = 0
for i in range(len(lc)-1):
sns.distplot(df[lc[i]], ax=axes[c][j]);
j += 1
if i == 4:
c, j = 1, 0
plt.tight_layout()
# Create a StandardScaler:
scaler = StandardScaler()
# Fit the Scaler to the dataset excluding the labels:
scaler.fit(df.drop('TARGET CLASS', axis=1))
# Scale or standardise the dataset:
scaled = scaler.transform(df.drop('TARGET CLASS', axis=1))
# Recreate the DataFrame
df1 = pd.DataFrame(scaled, columns = df.columns[:-1])
df1.head()
X_train, X_test, y_train, y_test = train_test_split(df1,df['TARGET CLASS'], test_size=0.30)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
print(classification_report(y_test,predictions))
print('The Confusion Matrix is: \n')
matrixc = confusion_matrix(y_test, predictions)
print(matrixc)
matrix = plot_confusion_matrix(knn ,X_test, y_test, values_format='d')
matrix.ax_.set_title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.gcf().axes[0].tick_params(color='black')
plt.gcf().set_size_inches(4,3)
plt.show()
error_rate = []
d = dict()
for i in range(1, 51):
knn_c = KNeighborsClassifier(n_neighbors=i)
knn_c.fit(X_train, y_train)
pred_i = knn_c.predict(X_test)
error_rate.append(np.mean(pred_i != y_test))
d[i] = np.mean(pred_i != y_test)
sns.jointplot(range(1, 51), error_rate, size=5);
mn = min(d.values())
for i in d:
if d[i] == mn:
print(i,':', mn)
# KNN With K=1
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
print('KNN with K=1:')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
print('*******************************************************')
# KNN With K=31
knn = KNeighborsClassifier(n_neighbors=31)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
print('KNN with K=31:')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
# Importing the required libraries:
from sklearn.model_selection import train_test_split # For Train-Test split
from sklearn.tree import DecisionTreeClassifier # For Decision Trees
from sklearn.ensemble import RandomForestClassifier # For Random Forest
from sklearn.metrics import classification_report # For Classification Report
from sklearn.metrics import plot_confusion_matrix # For Plotting Confusion Matrix
from sklearn.metrics import confusion_matrix # For Confusion Matrix
df = pd.read_csv('kyphosis-Copy1.csv')
df.head()
df.info()
df.describe().transpose()
# Columns having Numerical data are
numeric_features = df.select_dtypes(include=[np.number])
print(list(numeric_features.columns))
sns.pairplot(df, hue='Kyphosis');
df.nunique()
X = df[['Age', 'Number', 'Start']]
y = df['Kyphosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
predictions = dtree.predict(X_test)
print(classification_report(y_test,predictions))
print('The Confusion Matrix is: \n')
matrixc = confusion_matrix(y_test, predictions)
print(matrixc)
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
rfc_predictions = rfc.predict(X_test)
print(classification_report(y_test, rfc_predictions))
print('The Confusion Matrix is: \n')
matrixc = confusion_matrix(y_test, rfc_predictions)
print(matrixc)
# Importing the required libraries:
from sklearn.model_selection import train_test_split # For Train-Test split
from sklearn.svm import SVC # The Support Vector Classifier
from sklearn.model_selection import GridSearchCV # For Grid Search
from sklearn.metrics import classification_report # For Classification Report
from sklearn.metrics import plot_confusion_matrix # For Plotting Confusion Matrix
from sklearn.metrics import confusion_matrix # For Confusion Matrix
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
print(cancer['DESCR'])
# The main features DataFrame
df = pd.DataFrame(cancer['data'],columns=cancer['feature_names'])
df.head()
# The prediction labels DataFrame
df_target = pd.DataFrame(cancer['target'],columns=['Cancer'])
df.info()
df.describe().transpose()
# sns.pairplot(df);
X_train, X_test, y_train, y_test = train_test_split(df, cancer['target'], test_size=0.30, random_state=101)
# Fit a Support Vector Classifier (default hyperparameters) and evaluate it:
svcmodel = SVC()
svcmodel.fit(X_train, y_train)
# Bug fix: the original called `svc.predict(...)`, but no name `svc` exists —
# the fitted model is `svcmodel`.
predictions = svcmodel.predict(X_test)
print(classification_report(y_test, predictions))
print('The Confusion Matrix is: \n')
matrixc = confusion_matrix(y_test, predictions)
print(matrixc)
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
# May take a while!
grid.fit(X_train,y_train)
grid.best_params_
grid.best_estimator_
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test,grid_predictions))
print(classification_report(y_test,grid_predictions))
# Importing the required libraries:
from sklearn.model_selection import train_test_split # For Train-Test split
from sklearn.cluster import KMeans # For KMeans Clustering
from sklearn.metrics import classification_report # For Classification Report
from sklearn.metrics import plot_confusion_matrix # For Plotting Confusion Matrix
from sklearn.metrics import confusion_matrix # For Confusion Matrix
from sklearn.datasets import make_blobs
# Creating a Random Clustered dataset
data = make_blobs(n_samples=2000, n_features=2, centers=4, cluster_std=1.8,random_state=101)
plt.scatter(data[0][:,0],data[0][:,1],c=data[1],cmap='rainbow');
kmeans = KMeans(n_clusters=4)
# Fit the Model to the features
kmeans.fit(data[0])
kmeans.cluster_centers_
kmeans.labels_
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True,figsize=(10,6))
ax1.set_title('K Means')
ax1.scatter(data[0][:,0],data[0][:,1],c=kmeans.labels_,cmap='rainbow');
ax2.set_title("Original")
ax2.scatter(data[0][:,0],data[0][:,1],c=data[1],cmap='rainbow');
## <font color='darkred'> **12) Principal Component Analysis (PCA):-** <a name="p12"></a> </font>
# Importing the required libraries:
from sklearn.model_selection import train_test_split # For Train-Test split
from sklearn.preprocessing import StandardScaler # For scaling the data
from sklearn.decomposition import PCA # For PCA
from sklearn.metrics import classification_report # For Classification Report
from sklearn.metrics import plot_confusion_matrix # For Plotting Confusion Matrix
from sklearn.metrics import confusion_matrix # For Confusion Matrix
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
cancer.keys()
print(cancer['DESCR'])
df = pd.DataFrame(cancer['data'],columns=cancer['feature_names'])
#(['DESCR', 'data', 'feature_names', 'target_names', 'target'])
df.head()
# Scaling the data
scaler = StandardScaler()
scaler.fit(df)
scaled_data = scaler.transform(df)
pca = PCA(n_components=2)
pca.fit(scaled_data)
# Now we can transform this data to its first 2 principal components.
x_pca = pca.transform(scaled_data)
scaled_data.shape
x_pca.shape
# Great! We've reduced 30 dimensions to just 2! Let's plot these two dimensions out!
plt.figure(figsize=(8,6))
plt.scatter(x_pca[:,0],x_pca[:,1],c=cancer['target'],cmap='plasma');
plt.xlabel('First principal component');
plt.ylabel('Second Principal Component');
# Clearly, by using these two components we can easily separate the two classes.
# Unfortunately, with this great power of dimensionality reduction comes the cost of not
# being able to easily understand what these components represent.
# The components correspond to combinations of the original features; the components
# themselves are stored as an attribute of the fitted PCA object:
pca.components_
df_comp = pd.DataFrame(pca.components_,columns=cancer['feature_names'])
plt.figure(figsize=(12,6))
sns.heatmap(df_comp,cmap='plasma',);
# --- Movie recommender: correlation-based item similarity -------------------
# Importing the required libraries:
from sklearn.model_selection import train_test_split   # For Train-Test split
from sklearn.cluster import KMeans                     # For KMeans Clustering
from sklearn.metrics import classification_report      # For Classification Report
# plot_confusion_matrix was removed in scikit-learn 1.2; use
# sklearn.metrics.ConfusionMatrixDisplay instead.
# from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix           # For Confusion Matrix

# Importing the dataset (MovieLens-style ratings: tab separated, no header).
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u-Copy1.data', sep='\t', names=column_names)

# Getting the Movie Titles as well:
movie_titles = pd.read_csv("Movie_Id_Titles-Copy1")
movie_titles.head()

# Merging the DataFrames on the shared movie id:
df = pd.merge(df, movie_titles, on='item_id')
df.head()
df.info()
df.nunique()

# Average Rating for each movie.  Select the 'rating' column BEFORE
# aggregating: cheaper than aggregating every column and then indexing.
meanr = pd.DataFrame(df.groupby('title')['rating'].mean())
meanr.sort_values(by='rating', ascending=False)

# Number of Ratings for each movie
countr = pd.DataFrame(df.groupby('title')['rating'].count())
countr.sort_values(by='rating', ascending=False)

dfr = pd.merge(meanr, countr, on='title')
dfr.rename(columns = {'rating_x': 'avg rating', 'rating_y': 'no of ratings'}, inplace = True)
dfr.head()

sns.distplot(dfr['avg rating'], bins=50);
# seaborn >= 0.12 dropped positional data arguments -- pass them by keyword.
sns.jointplot(x='avg rating', y='no of ratings', data=dfr);

# User x Movie matrix of ratings (NaN where a user did not rate a movie).
moviemat = df.pivot_table(index='user_id',columns='title',values='rating')
moviemat.head()

starwars_user_ratings = moviemat['Star Wars (1977)']
liarliar_user_ratings = moviemat['Liar Liar (1997)']
starwars_user_ratings.head()

# Correlation of every movie's rating column with the chosen movie's ratings.
similar_to_starwars = moviemat.corrwith(starwars_user_ratings);
similar_to_liarliar = moviemat.corrwith(liarliar_user_ratings);

corr_starwars = pd.DataFrame(similar_to_starwars,columns=['Correlation'])
corr_starwars.dropna(inplace=True)
corr_starwars.head()
corr_starwars.sort_values('Correlation',ascending=False).head(10)

# Filter out movies with few ratings -- correlations over tiny user overlaps
# are noisy.
corr_starwars = corr_starwars.join(dfr['no of ratings'])
corr_starwars.head()
corr_starwars[corr_starwars['no of ratings']>100].sort_values('Correlation',ascending=False).head()

corr_liarliar = pd.DataFrame(similar_to_liarliar,columns=['Correlation'])
corr_liarliar.dropna(inplace=True)
corr_liarliar = corr_liarliar.join(dfr['no of ratings'])
corr_liarliar[corr_liarliar['no of ratings']>100].sort_values('Correlation',ascending=False).head()
# --- Hypothesis testing: early vs. late assignment submitters ----------------
# Importing new libraries from scipy
from scipy import stats

df = pd.read_csv('grades.csv')
df.head()
df.describe().T
df.info()

# Split the students on their assignment-1 submission date.  Parsing the
# whole column once with pd.to_datetime is vectorized (the original called
# .apply(pd.to_datetime) twice, once per comparison).
submitted = pd.to_datetime(df['assignment1_submission'])
early_finishers = df[submitted < '2016']
early_finishers.head()

# (variable renamed from the original misspelled `late_finsihers`)
late_finishers_by_date = df[submitted > '2016']
late_finishers_by_date.head()

# Another solution. First, the dataframe df and the early_finishers share index values, so I really just
# want everything in the df which is not in early_finishers
late_finishers=df[~df.index.isin(early_finishers.index)]
late_finishers.head()

early_finishers.describe().T
late_finishers.describe().T

# Let's bring in our ttest_ind function
from scipy.stats import ttest_ind

# Let's run this function with our two populations, looking at the assignment 1 grades
ttest_ind(early_finishers['assignment1_grade'], late_finishers['assignment1_grade'])

# Checking the other assignment grades as well:
print(ttest_ind(early_finishers['assignment2_grade'], late_finishers['assignment2_grade']))
print(ttest_ind(early_finishers['assignment3_grade'], late_finishers['assignment3_grade']))
print(ttest_ind(early_finishers['assignment4_grade'], late_finishers['assignment4_grade']))
print(ttest_ind(early_finishers['assignment5_grade'], late_finishers['assignment5_grade']))
print(ttest_ind(early_finishers['assignment6_grade'], late_finishers['assignment6_grade']))
# --- Gaussian Naive Bayes on the diabetes dataset ---------------------------
# Importing the required libraries:
from sklearn.model_selection import train_test_split   # For Train-Test split
from sklearn.preprocessing import StandardScaler       # For scaling the dataset
from sklearn.naive_bayes import GaussianNB             # The Gaussian Naive Bayes
from sklearn.metrics import classification_report      # For Classification Report
# plot_confusion_matrix was removed in scikit-learn 1.2; use
# sklearn.metrics.ConfusionMatrixDisplay instead.
# from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix           # For Confusion Matrix

df = pd.read_csv('diabetes.csv')
df.head()
df.info()
df.describe().T
sns.pairplot(df);

# 70/30 train-test split on the 'Outcome' target.
X_train, X_test, y_train, y_test = train_test_split(df.drop('Outcome', axis=1), df['Outcome'], test_size=0.3, random_state=42)

nb = GaussianNB()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
nb.score(X_test, y_test)  # mean accuracy on the test set

print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))
# --- datetime basics ---------------------------------------------------------
# Importing the required libraries:
from datetime import datetime
from datetime import timedelta
from dateutil.parser import parse
from pandas.tseries.offsets import Hour, Minute
# Remember the current pandas display setting so it can be restored later.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.set_printoptions(precision=4, suppress=True)
today = datetime.now()
today
print(today.date())
print(today.day)
print(today.month)
print(today.year)
print(today.hour)
print(today.minute)
print(today.second)
print(today.astimezone())
print(today.timestamp())
# Subtracting two datetimes yields a timedelta (days + seconds components).
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)
print(delta)
print(delta.days)
print(delta.seconds)
# Using timedelta
# timedelta(12) is 12 days (days is the first positional argument).
start = datetime(2011, 1, 7)
start + timedelta(12)
start - 2*timedelta(12)
stamp = datetime(year=2020, month=6, day=28)
str(stamp)
# Conversion from timestamp to string
s1 = stamp.strftime('%y-%m-%d')
s2 = stamp.strftime('%Y-%m-%d')
print(s1)
print(s2)
# Conversion from string to timestamp
value = '2020-06-28'
d1 = datetime.strptime(value, '%Y-%m-%d')
d1
# dateutil.parse handles free-form date strings.
d2 = parse('Jan 31, 1997 10:45 PM')
d2
# dayfirst=True parses '6/12/2011' as 6 December 2011.
d3 = parse('6/12/2011', dayfirst=True)
d3
datestrs = ['2011-07-06 12:00:00', '2011-08-06 00:00:00']
pd.to_datetime(datestrs)
# A Series indexed by datetimes becomes a time series.
dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),
datetime(2011, 1, 7), datetime(2011, 1, 8),
datetime(2011, 1, 10), datetime(2011, 1, 12)]
ts = pd.Series(np.random.randn(6), index=dates)
ts.head()
ts.index
ts.index.dtype
stamp = ts.index[0]
stamp
# We can use Timestamp indices in many different formats
print(ts['1/10/2011'])
print(ts['20110110'])
print(ts['2011-01-10'])
longer_ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
longer_ts
# Selecting only the indices from May 2001
longer_ts['2001-05']
# Slicing with date strings includes BOTH endpoints.
ts['1/6/2011':'1/11/2011']
dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000',
'1/2/2000', '1/3/2000'])
dup_ts = pd.Series(np.arange(5), index=dates)
dup_ts
# Checking if all the index timestamps are unique
dup_ts.index.is_unique
print(dup_ts['1/3/2000']) # not duplicated
print()
print(dup_ts['1/2/2000']) # duplicated
# Using groupby for duplicate indices
grouped = dup_ts.groupby(level=0)
grouped.mean()
grouped.count()
ts
# The resampler object is built here but not aggregated yet.
resampler = ts.resample('D') # 'D' indicates daily frequency
index = pd.date_range('2012-04-01', '2012-06-01')
index
# From start date, 20 periods
pd.date_range(start='2012-04-01', periods=20)
# Reverse from end date, 20 periods
pd.date_range(end='2012-06-01', periods=20)
# 'BM' = last business day of each month.
pd.date_range('2000-01-01', '2000-12-01', freq='BM')
pd.date_range('2012-05-02 12:56:31', periods=5)
# normalize=True truncates the timestamps to midnight.
pd.date_range('2012-05-02 12:56:31', periods=5, normalize=True)
pd.date_range('2000-01-01', '2000-01-03 23:59', freq='4h')
pd.date_range('2000-01-01', periods=10, freq='1h30min')
ts = pd.Series(np.random.randn(4), index=pd.date_range('1/1/2000', periods=4, freq='M'))
print(ts)
print()
# shift moves the VALUES forward/backward relative to the unchanged index.
ts.shift(2)
ts.shift(-2)
# --- Time zone handling ------------------------------------------------------
import pytz
pytz.common_timezones[-5:]
tz = pytz.timezone('America/New_York')
tz
rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
# None here -- the index is timezone-naive.
print(ts.index.tz)
pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')
ts
# tz_localize attaches a zone to naive timestamps; tz_convert translates an
# already-localized index into another zone.
ts_utc = ts.tz_localize('UTC')
ts_utc
ts_utc.index
ts_utc.tz_convert('America/New_York')
ts_eastern = ts.tz_localize('America/New_York')
ts_eastern.tz_convert('UTC')
ts_eastern.tz_convert('Europe/Berlin')
ts.index.tz_localize('Asia/Shanghai')
stamp = pd.Timestamp('2011-03-12 04:00')
stamp_utc = stamp.tz_localize('utc')
stamp_utc.tz_convert('America/New_York')
stamp_moscow = pd.Timestamp('2011-03-12 04:00', tz='Europe/Moscow')
stamp_moscow
# .value is the epoch time in nanoseconds; it is unchanged by tz conversion.
stamp_utc.value
stamp_utc.tz_convert('America/New_York').value
from pandas.tseries.offsets import Hour
# Offset arithmetic around DST transitions respects the wall-clock change.
stamp = pd.Timestamp('2012-03-12 01:30', tz='US/Eastern')
stamp
stamp + Hour()
stamp = pd.Timestamp('2012-11-04 00:30', tz='US/Eastern')
stamp
stamp + 2 * Hour()
rng = pd.date_range('3/7/2012 9:30', periods=10, freq='B')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
ts1 = ts[:7].tz_localize('Europe/London')
ts2 = ts1[2:].tz_convert('Europe/Moscow')
# Combining series with different zones produces a UTC-indexed result.
result = ts1 + ts2
result.index
# --- Periods (fixed-frequency time spans) ------------------------------------
p = pd.Period(2007, freq='A-DEC')
p
p + 5
p - 2
pd.Period('2014', freq='A-DEC') - p
rng = pd.period_range('2000-01-01', '2000-06-30', freq='M')
rng
pd.Series(np.random.randn(6), index=rng)
values = ['2001Q3', '2002Q2', '2003Q1']
index = pd.PeriodIndex(values, freq='Q-DEC')
index
p = pd.Period('2007', freq='A-DEC')
p
# asfreq converts a period to a finer/coarser frequency, anchored at its
# start or its end.
p.asfreq('M', how='start')
p.asfreq('M', how='end')
p = pd.Period('2007', freq='A-JUN')
p
p.asfreq('M', 'start')
p.asfreq('M', 'end')
p = pd.Period('Aug-2007', 'M')
p.asfreq('A-JUN')
rng = pd.period_range('2006', '2009', freq='A-DEC')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
ts.asfreq('M', how='start')
ts.asfreq('B', how='end')
# Quarterly periods with a January fiscal year-end.
p = pd.Period('2012Q4', freq='Q-JAN')
p
p.asfreq('D', 'start')
p.asfreq('D', 'end')
# 4 PM (16 * 60 minutes) on the second-to-last business day of the quarter.
p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60
p4pm
p4pm.to_timestamp()
rng = pd.period_range('2011Q3', '2012Q4', freq='Q-JAN')
ts = pd.Series(np.arange(len(rng)), index=rng)
ts
# Converting between timestamps and periods.
rng = pd.date_range('2000-01-01', periods=3, freq='M')
ts = pd.Series(np.random.randn(3), index=rng)
ts
pts = ts.to_period()
pts
rng = pd.date_range('1/29/2000', periods=6, freq='D')
ts2 = pd.Series(np.random.randn(6), index=rng)
ts2
ts2.to_period('M')
pts = ts2.to_period()
pts
pts.to_timestamp(how='end')
# Build a PeriodIndex from separate year/quarter columns of the dataset.
data = pd.read_csv('macrodata.csv')
data.head(5)
data.year
data.quarter
index = pd.PeriodIndex(year=data.year, quarter=data.quarter,
freq='Q-DEC')
index
data.index = index
data.infl
# --- Resampling --------------------------------------------------------------
rng = pd.date_range('2000-01-01', periods=100, freq='D')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
# Downsample daily data to monthly means.
ts.resample('M').mean()
ts.resample('M', kind='period').mean()
rng = pd.date_range('2000-01-01', periods=12, freq='T')
ts = pd.Series(np.arange(12), index=rng)
ts
# closed='right' puts each bin-boundary timestamp into the bin on its left.
ts.resample('5min', closed='right').sum()
ts.resample('5min', closed='right').sum()
ts.resample('5min', closed='right', label='right').sum()
# NOTE(review): the loffset argument was deprecated in pandas 1.1 and removed
# in 2.0; on modern pandas shift the result index by Timedelta('-1s') instead.
ts.resample('5min', closed='right',
label='right', loffset='-1s').sum()
# Open-high-low-close aggregation per bin.
ts.resample('5min').ohlc()
frame = pd.DataFrame(np.random.randn(2, 4),
index=pd.date_range('1/1/2000', periods=2,
freq='W-WED'),
columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame
# Upsampling introduces NaNs unless a fill method is used.
df_daily = frame.resample('D').asfreq()
df_daily
frame.resample('D').ffill()
frame.resample('D').ffill(limit=2)
frame.resample('W-THU').ffill()
frame = pd.DataFrame(np.random.randn(24, 4),
index=pd.period_range('1-2000', '12-2001',
freq='M'),
columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame[:5]
annual_frame = frame.resample('A-DEC').mean()
annual_frame
# Q-DEC: Quarterly, year ending in December
annual_frame.resample('Q-DEC').ffill()
annual_frame.resample('Q-DEC', convention='end').ffill()
annual_frame.resample('Q-MAR').ffill()
# --- Moving-window statistics on stock prices --------------------------------
close_px_all = pd.read_csv('stock_px_2.csv',
parse_dates=True, index_col=0)
close_px = close_px_all[['AAPL', 'MSFT', 'XOM']]
# Fill to business-day frequency before computing windows.
close_px = close_px.resample('B').ffill()
close_px.AAPL.plot()
# 250-observation (~1 trading year) moving average.
close_px.AAPL.rolling(250).mean().plot()
plt.figure()
# min_periods lets the window emit values before it is completely full.
appl_std250 = close_px.AAPL.rolling(250, min_periods=10).std()
appl_std250[5:12]
appl_std250.plot()
expanding_mean = appl_std250.expanding().mean()
plt.figure()
close_px.rolling(60).mean().plot(logy=True)
# Time-based window: 20 calendar days.
close_px.rolling('20D').mean()
plt.figure()
aapl_px = close_px.AAPL['2006':'2007']
ma60 = aapl_px.rolling(30, min_periods=20).mean()
ewma60 = aapl_px.ewm(span=30).mean()
ma60.plot(style='k--', label='Simple MA')
ewma60.plot(style='k-', label='EW MA')
plt.legend()
plt.figure()
# Rolling correlation of each stock's returns with the S&P 500's returns.
spx_px = close_px_all['SPX']
spx_rets = spx_px.pct_change()
returns = close_px.pct_change()
corr = returns.AAPL.rolling(125, min_periods=100).corr(spx_rets)
corr.plot()
plt.figure()
corr = returns.rolling(125, min_periods=100).corr(spx_rets)
corr.plot()
plt.figure()
# Rolling percentile rank of a fixed 2% return within each 250-day window.
from scipy.stats import percentileofscore
score_at_2percent = lambda x: percentileofscore(x, 0.02)
result = returns.AAPL.rolling(250).apply(score_at_2percent)
result.plot()
# --- Time-series stationarity: AirPassengers ---------------------------------
# Importing the dataset
df = pd.read_csv('AirPassengers.csv')
df
df.info()
df.describe().T
df.index
# Parse strings to datetime type.  Format inference is the default behaviour;
# the infer_datetime_format flag is deprecated since pandas 2.0, so it is
# simply dropped here.
df['Month'] = pd.to_datetime(df['Month'])
df = df.set_index('Month')
df
df.columns
# Checking the Stationarity visually first:
plt.plot(df);
# Test whether the Timeseries is Stationary or not
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    """Plot rolling statistics and run the Augmented Dickey-Fuller test.

    Parameters
    ----------
    timeseries : pd.DataFrame
        Single-column DataFrame indexed by date.  The FIRST column is
        tested, so the function is no longer hard-wired to the
        '#Passengers' column name of the AirPassengers dataset (the
        original indexed ``timeseries['#Passengers']`` directly).

    Side effects: shows a matplotlib figure and prints the ADF results.
    """
    # Determine rolling statistics over a 12-observation (monthly) window.
    rolmean = timeseries.rolling(window=12).mean()
    rolstd = timeseries.rolling(window=12).std()

    # Plot rolling statistics:
    print('Rolling Statistics Test:')
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Perform Augmented-Dickey-Fuller (ADF) test on the first column:
    print('Results of Augmented-Dickey-Fuller (ADF) Test:')
    dftest = adfuller(timeseries.iloc[:, 0], autolag='AIC')
    dfoutput = pd.Series(dftest[0:4],
                         index=['Test Statistic', 'p-value', '#Lags Used',
                                'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
    # Low p-value => reject the unit-root null hypothesis (series is
    # plausibly stationary).
    if dfoutput.loc['p-value'] < 0.05:
        print('\n>> Reject H0, the dataset can be stationary')
    else:
        print('\n>> Failed to Reject H0, the dataset can be non-stationary!')
# Run the stationarity check on the raw series, then on progressively
# transformed versions of it.
test_stationarity(df)
# 1) Log transform to stabilise the variance.
df_log = np.log(df)
df_log.dropna(inplace=True)
test_stationarity(df_log)
# 2) Subtract the 12-month rolling mean to remove the trend.
moving_avg = df_log.rolling(window=12).mean()
df_moving_avg = df_log - moving_avg
df_moving_avg.dropna(inplace=True)
test_stationarity(df_moving_avg)
# 3) Subtract an exponentially-weighted moving average instead.
exp_moving_avg = df_log.ewm(halflife=12, min_periods=0, adjust=True).mean()
df_exp_moving_avg = df_log - exp_moving_avg
df_exp_moving_avg.dropna(inplace=True)
test_stationarity(df_exp_moving_avg)
# 4) First difference of the log series.
df_log_shift = df_log - df_log.shift()
df_log_shift.dropna(inplace=True)
test_stationarity(df_log_shift)
# --- SARIMA forecasting: Superstore furniture sales --------------------------
import statsmodels.api as sm
df = pd.read_excel("Sample - Superstore.xls")
furniture = df.loc[df['Category'] == 'Furniture']
df.describe()
furniture['Order Date'].min(), furniture['Order Date'].max()
furniture.isnull().sum()
# Aggregate to one total-sales row per order date.
furniture = furniture.groupby('Order Date')['Sales'].sum().reset_index()
# Indexing with time series data
furniture = furniture.set_index('Order Date')
furniture.index
# Monthly-start ('MS') average of the daily sales totals.
y = furniture['Sales'].resample('MS').mean()
y['2017':]
y.plot(figsize=(15, 6))
plt.show()
from pylab import rcParams
rcParams['figure.figsize'] = 18, 8
# Decompose the series into trend + seasonality + residual.
decomposition = sm.tsa.seasonal_decompose(y, model='additive')
fig = decomposition.plot()
plt.show()
# Grid-search SARIMA orders: p, d, q each drawn from {0, 1}, with a fixed
# 12-month seasonal cycle.  Lowest AIC wins.
import itertools
p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(y,
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            results = mod.fit()
            print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))
        except Exception:
            # Narrowed from a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit; only skip parameter combinations
            # whose fit genuinely fails.
            continue
# Fit the chosen SARIMA(1,1,1)x(1,1,0,12) specification.
mod = sm.tsa.statespace.SARIMAX(y,
order=(1, 1, 1),
seasonal_order=(1, 1, 0, 12),
enforce_stationarity=False,
enforce_invertibility=False)
results = mod.fit()
print(results.summary().tables[1])
results.plot_diagnostics(figsize=(16, 8))
plt.show()
# One-step-ahead prediction from 2017 onwards, with confidence intervals.
pred = results.get_prediction(start=pd.to_datetime('2017-01-01'), dynamic=False)
pred_ci = pred.conf_int()
ax = y['2014':].plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7))
ax.fill_between(pred_ci.index,
pred_ci.iloc[:, 0],
pred_ci.iloc[:, 1], color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('Furniture Sales')
plt.legend()
plt.show()
# --- NLP setup ---------------------------------------------------------------
# Importing the required libraries:
import nltk
# Importing the dataset
# nltk.download_shell()
# --- Keras regression on a synthetic dataset ---------------------------------
import tensorflow as tf
df = pd.read_csv('fake_reg.csv')
df.head()
df.describe().T
sns.pairplot(df);
X = df[['feature1', 'feature2']].values
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
sc = StandardScaler()
# Fit the scaler on the TRAINING data only, then apply that same
# transformation to the test data.  The original called
# sc.fit_transform(X_test), which refits the scaler on the test set and
# leaks test-set statistics into the evaluation.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train.max()
Keep in mind what kind of problem you are trying to solve:
# Reference snippets: choose the loss to match the problem type.
# NOTE(review): these lines assume a `model` object already exists; they are
# shown here for reference before the model is actually built below.
# For a multi-class classification problem
model.compile(optimizer='rmsprop',
loss='categorical_crossentropy',
metrics=['accuracy'])
# For a binary classification problem
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=['accuracy'])
# For a mean squared error regression problem
model.compile(optimizer='rmsprop',
loss='mse')
## Building a Neural Network Model
model_ = tf.keras.Sequential([tf.keras.layers.Dense(units=4, activation='relu'),
tf.keras.layers.Dense(units=2, activation='relu'),
tf.keras.layers.Dense(units=1)]) # First layer with 4 neurons, next with 2, and the final output layer.
# Another way to do build such a model is:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
model = Sequential()
# Add Layers now:
model.add(Dense(units=4, activation='relu')) # A ReLU activation layer with 4 neurons
model.add(Dense(units=2, activation='relu')) # A ReLU activation layer with 2 neurons
model.add(Dense(units=1)) # Final output layer (single regression value)
model.compile(optimizer='rmsprop', loss='mse')
Below are some common definitions that are necessary to know and understand to correctly utilize Keras:
# Train for 250 epochs on the scaled training data.
model.fit(X_train,y_train,epochs=250)
# Evaluate performance on both the training and the test set; comparing the
# two losses is a quick check for overfitting.  (This sentence was bare
# markdown in the notebook export -- a SyntaxError as code -- now a comment.)
model.history.history  # dict of per-epoch metric lists
loss = model.history.history['loss']
sns.lineplot(x=range(len(loss)),y=loss)
plt.title("Training Loss per Epoch");
model.metrics_names
# Loss on the Training data
model.evaluate(X_train, y_train)
# Loss on the Test data
model.evaluate(X_test, y_test)
test_predictions = model.predict(X_test)
test_predictions
We will be using data from a Kaggle data set:
https://www.kaggle.com/harlfoxem/housesalesprediction
# --- Keras regression: King County house prices ------------------------------
df = pd.read_csv('kc_house_data.csv')
df
df.describe().T
sns.distplot(df['price']);
sns.countplot(df['bedrooms']);
plt.figure(figsize=(12,8))
# Geographic scatter: longitude/latitude coloured by price.
sns.scatterplot(x='long',y='lat',
data=df,hue='price',
palette='RdYlGn',edgecolor=None,alpha=0.2);
# Rows after the top 216 by price (presumably ~the top 1% of listings --
# note this subset is built but the full df is used below).
non_top_1_perc = df.sort_values('price',ascending=False).iloc[216:]
df.drop('id', axis=1, inplace=True)
# Feature engineering: derive year and month from the sale date.
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].apply(lambda date: date.year)
df['month'] = df['date'].apply(lambda date: date.month)
sns.boxplot(x='year',y='price',data=df)
sns.boxplot(x='month',y='price',data=df)
df.groupby('month').mean()['price'].plot()
df.groupby('year').mean()['price'].plot()
df = df.drop('date',axis=1)
df.columns
# https://i.pinimg.com/originals/4a/ab/31/4aab31ce95d5b8474fd2cc063f334178.jpg
# May be worth considering to remove this or feature engineer categories from it
df['zipcode'].value_counts()
# zipcode is a high-cardinality categorical -- dropped rather than encoded.
df = df.drop('zipcode',axis=1)
df.head()
# could make sense due to scaling, higher should correlate to more value
df['yr_renovated'].value_counts()
df['sqft_basement'].value_counts()
# Feature matrix / target split, then a 70/30 train-test split.
X = df.drop('price',axis=1)
y = df['price']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=101)
# Scale features to [0, 1]; the scaler is fitted on the training set only
# and merely applied to the test set (no leakage).
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train= scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train.shape
X_test.shape
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
# Four hidden ReLU layers of 19 units (matching the 19 input features),
# then a single linear output for the price.
model = Sequential()
model.add(Dense(19, activation='relu'))
model.add(Dense(19, activation='relu'))
model.add(Dense(19, activation='relu'))
model.add(Dense(19, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
# Mini-batch training; validation_data tracks the test loss each epoch.
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=128, epochs=400);
https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics
# Regression metrics -- see
# https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score
X_test
predictions = model.predict(X_test)
mean_absolute_error(y_test,predictions)
np.sqrt(mean_squared_error(y_test,predictions))
explained_variance_score(y_test,predictions)
# Compare the error magnitude against the overall price scale:
df['price'].mean()
df['price'].median()
# Our predictions
plt.scatter(y_test,predictions)
# Perfect predictions
plt.plot(y_test,y_test,'r')
# reshape(-1, 1) instead of the hard-coded test-set size (6480) so this
# line keeps working for any train/test split ratio.
errors = y_test.values.reshape(-1, 1) - predictions
sns.distplot(errors)
# Sanity check on a single row: scale it with the SAME fitted scaler,
# then predict.
single_house = df.drop('price',axis=1).iloc[0]
single_house = scaler.transform(single_house.values.reshape(-1, 19))
single_house
model.predict(single_house)
df.iloc[0]
Let's explore a classification task with Keras API for TF 2.0
Data Set Characteristics:
:Number of Instances: 569
:Number of Attributes: 30 numeric, predictive attributes and the class
:Attribute Information:
- radius (mean of distances from center to points on the perimeter)
- texture (standard deviation of gray-scale values)
- perimeter
- area
- smoothness (local variation in radius lengths)
- compactness (perimeter^2 / area - 1.0)
- concavity (severity of concave portions of the contour)
- concave points (number of concave portions of the contour)
- symmetry
- fractal dimension ("coastline approximation" - 1)
The mean, standard error, and "worst" or largest (mean of the three
largest values) of these features were computed for each image,
resulting in 30 features. For instance, field 3 is Mean Radius, field
13 is Radius SE, field 23 is Worst Radius.
- class:
- WDBC-Malignant
- WDBC-Benign
:Summary Statistics:
===================================== ====== ======
Min Max
===================================== ====== ======
radius (mean): 6.981 28.11
texture (mean): 9.71 39.28
perimeter (mean): 43.79 188.5
area (mean): 143.5 2501.0
smoothness (mean): 0.053 0.163
compactness (mean): 0.019 0.345
concavity (mean): 0.0 0.427
concave points (mean): 0.0 0.201
symmetry (mean): 0.106 0.304
fractal dimension (mean): 0.05 0.097
radius (standard error): 0.112 2.873
texture (standard error): 0.36 4.885
perimeter (standard error): 0.757 21.98
area (standard error): 6.802 542.2
smoothness (standard error): 0.002 0.031
compactness (standard error): 0.002 0.135
concavity (standard error): 0.0 0.396
concave points (standard error): 0.0 0.053
symmetry (standard error): 0.008 0.079
fractal dimension (standard error): 0.001 0.03
radius (worst): 7.93 36.04
texture (worst): 12.02 49.54
perimeter (worst): 50.41 251.2
area (worst): 185.2 4254.0
smoothness (worst): 0.071 0.223
compactness (worst): 0.027 1.058
concavity (worst): 0.0 1.252
concave points (worst): 0.0 0.291
symmetry (worst): 0.156 0.664
fractal dimension (worst): 0.055 0.208
===================================== ====== ======
:Missing Attribute Values: None
:Class Distribution: 212 - Malignant, 357 - Benign
:Creator: Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian
:Donor: Nick Street
:Date: November, 1995
This is a copy of UCI ML Breast Cancer Wisconsin (Diagnostic) datasets. https://goo.gl/U2Uwz2
Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.
Separating plane described above was obtained using Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree Construction Via Linear Programming." Proceedings of the 4th Midwest Artificial Intelligence and Cognitive Science Society, pp. 97-101, 1992], a classification method which uses linear programming to construct a decision tree. Relevant features were selected using an exhaustive search in the space of 1-4 features and 1-3 separating planes.
The actual linear program used to obtain the separating plane in the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: "Robust Linear Programming Discrimination of Two Linearly Inseparable Sets", Optimization Methods and Software 1, 1992, 23-34].
This database is also available through the UW CS ftp server:
ftp ftp.cs.wisc.edu, then: cd math-prog/cpo-dataset/machine-learn/WDBC/
.. topic:: References
# --- Keras binary classification: breast-cancer dataset ----------------------
df = pd.read_csv('../DATA/cancer_classification.csv')
df.info()
df.describe().transpose()
import seaborn as sns
import matplotlib.pyplot as plt
# Class balance of the binary label.
sns.countplot(x='benign_0__mal_1',data=df)
sns.heatmap(df.corr())
df.corr()['benign_0__mal_1'].sort_values()
df.corr()['benign_0__mal_1'].sort_values().plot(kind='bar')
# Same bar plot without the final entry (the label's own correlation of 1.0
# -- assumes the label is the last column; confirm against the CSV).
df.corr()['benign_0__mal_1'][:-1].sort_values().plot(kind='bar')
X = df.drop('benign_0__mal_1',axis=1).values
y = df['benign_0__mal_1'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=101)
# Scale to [0, 1]; fitted on the training set only (no leakage).
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# For a binary classification problem
# NOTE(review): this compile call runs against whatever `model` object is
# still bound from the previous section -- a fresh model is built just below.
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=['accuracy'])
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
X_train.shape
model = Sequential()
# https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
model.add(Dense(units=30,activation='relu'))
model.add(Dense(units=15,activation='relu'))
# Sigmoid output for a binary label.
model.add(Dense(units=1,activation='sigmoid'))
# For a binary classification problem
model.compile(loss='binary_crossentropy', optimizer='adam')
# https://stats.stackexchange.com/questions/164876/tradeoff-batch-size-vs-number-of-iterations-to-train-a-neural-network
# https://datascience.stackexchange.com/questions/18414/are-there-any-rules-for-choosing-the-size-of-a-mini-batch
model.fit(x=X_train,
y=y_train,
epochs=600,
validation_data=(X_test, y_test), verbose=1
)
# model.history.history
model_loss = pd.DataFrame(model.history.history)
# model_loss
# Train vs. validation loss -- validation loss rising while training loss
# keeps falling indicates overfitting.
model_loss.plot()
We obviously trained too much! Let's use early stopping to track the val_loss and stop training once it begins increasing too much!
# Rebuild the same architecture from scratch for the early-stopping run.
model = Sequential()
model.add(Dense(units=30,activation='relu'))
model.add(Dense(units=15,activation='relu'))
model.add(Dense(units=1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')
from tensorflow.keras.callbacks import EarlyStopping
Stop training when a monitored quantity has stopped improving.
Arguments:
monitor: Quantity to be monitored.
min_delta: Minimum change in the monitored quantity
to qualify as an improvement, i.e. an absolute
change of less than min_delta, will count as no
improvement.
patience: Number of epochs with no improvement
after which training will be stopped.
verbose: verbosity mode.
mode: One of `{"auto", "min", "max"}`. In `min` mode,
training will stop when the quantity
monitored has stopped decreasing; in `max`
mode it will stop when the quantity
monitored has stopped increasing; in `auto`
mode, the direction is automatically inferred
from the name of the monitored quantity.
# Stop once validation loss has not improved for 25 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=25)
model.fit(x=X_train,
y=y_train,
epochs=600,
validation_data=(X_test, y_test), verbose=1,
callbacks=[early_stop]
)
model_loss = pd.DataFrame(model.history.history)
model_loss.plot()
from tensorflow.keras.layers import Dropout
# Same architecture plus 50% dropout after each hidden layer, to further
# combat overfitting (combined with the early_stop callback from above).
model = Sequential()
model.add(Dense(units=30,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=15,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')
model.fit(x=X_train,
y=y_train,
epochs=600,
validation_data=(X_test, y_test), verbose=1,
callbacks=[early_stop]
)
model_loss = pd.DataFrame(model.history.history)
model_loss.plot()
# Sequential.predict_classes() was removed in TensorFlow 2.6; for a sigmoid
# output, threshold the predicted probabilities at 0.5 instead.
predictions = (model.predict(X_test) > 0.5).astype("int32")
from sklearn.metrics import classification_report,confusion_matrix
# https://en.wikipedia.org/wiki/Precision_and_recall
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))
# Read a count and two integer lists from stdin; sort both, then print "YES"
# when every element of the second list is strictly smaller than the element
# of the first list at the same rank, otherwise print "NO".
n = int(input())
sv = list(map(int, input().split()))
mc = list(map(int, input().split()))
sv.sort()
mc.sort()
# Compare rank-for-rank over the first n positions (same traversal as the
# original while/else loop, including the IndexError on short input).
if all(mc[idx] < sv[idx] for idx in range(n)):
    print('YES')
else:
    print('NO')
try:
    s = int(input())  # target digit sum
    d = int(input())  # required number of digits
    # Smallest and largest d-digit numbers.
    mn, mx = (10**(d-1)), (10**d - 1)

    def sm(n):
        """Return the sum of the decimal digits of n."""
        r = 0
        while n:
            r, n = r + n % 10, n // 10
        return r

    # Scan upward from the smallest d-digit number; the first match is the
    # smallest d-digit number whose digit sum equals s.
    for i in range(mn, mx + 1):
        if sm(i) == s:
            print('Smallest number is', i)
            break
    else:
        print('Not possible')
except (ValueError, EOFError):
    # Narrowed from a bare `except:` -- only malformed or missing input
    # should map to 'Not possible', not KeyboardInterrupt or genuine bugs.
    print('Not possible')
# Read the vertex count and the degree list for the tree check below.
n = int(input())
d = list(map(int, input().split()))
def checkt(degree, n=None):
    """Return True when a degree sequence can belong to a tree.

    A tree on n vertices has exactly n - 1 edges, and every edge
    contributes to two vertex degrees, so the degrees must sum to
    2 * (n - 1).

    Parameters
    ----------
    degree : list[int]
        Degree of every vertex.
    n : int, optional
        Number of vertices; defaults to ``len(degree)``.  The default makes
        the one-argument call ``checkt(d)`` used below work -- the original
        two-required-parameter signature made that call raise TypeError.
    """
    if n is None:
        n = len(degree)
    return sum(degree) == 2 * (n - 1)
# Pass the vertex count explicitly: the original called checkt(d) even
# though checkt was declared with two required parameters, which raised
# TypeError at runtime.
if checkt(d, n):
    print('Yes')
else:
    print('No')
#!/usr/bin/env python
# coding: utf-8
# In[103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import find_peaks
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')
# In[104]:
# In[104]:
# --- Peak detection and Gaussian fit on a measured signal --------------------
df = pd.read_csv('1 (2).csv')
# In[105]:
df.head()
# In[106]:
# Skip the first row (header/units artefact); keep Time vs Average(A).
# Note the [1:] slice keeps the original labels, so the index starts at 1.
x = df['Time'][1:]
y = df['Average(A)'][1:]
# In[107]:
x.dtype
# In[108]:
y.dtype
# In[109]:
x = x.astype(float, errors='raise')
y = y.astype(float, errors='raise')
# In[110]:
# seaborn >= 0.12 removed positional data arguments -- pass by keyword.
sns.lineplot(x=x, y=y)
# In[111]:
peaks = find_peaks(y, height=1, prominence=3, distance=1)
height = peaks[1]['peak_heights']  # list of the heights of the peaks
# find_peaks returns POSITIONAL (0-based) indices, but x/y kept their
# 1-based labels from the [1:] slice above.  Plain x[peaks[0]] looks values
# up by LABEL and is off by one (and raises KeyError for a peak at
# position 0) -- use .iloc for positional lookup.
peak_pos = x.iloc[peaks[0]]  # list of the peaks positions
# In[112]:
print(height)
# In[113]:
print(peaks[0])
# In[114]:
print(peak_pos)
# In[118]:
# Same positional-vs-label fix for the marker heights; plotting y.values
# gives a 0-based x-axis so markers line up with the positional indices.
plt.plot(peaks[0], y.iloc[peaks[0]], "xr")
plt.plot(y.values)
# plt.legend(['prominence'])
# In[119]:
from astropy import modeling
# In[147]:
fitter = modeling.fitting.LevMarLSQFitter()
model = modeling.models.Gaussian1D(amplitude=200, mean=0.026846, stddev=0.783768)
# depending on the data you need to give some initial values
fitted_model = fitter(model, x, y)
# In[148]:
plt.plot(x, y)
plt.plot(x, fitted_model(x))
# In[ ]: